{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## APPLY FINE-TUNED TRANFSORMER MODEL TO ALL STATEMENTS IN THE TEXT CORPUS\n",
    "## Stefan Müller and Naofumi Fujimura\n",
    "## Campaign Communication and Legislative Leadership (PSRM)\n",
    "## Code compiled successfully on 2 February 2024 using Python 3.11.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/smueller/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "arm64\n",
      "Darwin Kernel Version 23.2.0: Wed Nov 15 21:55:06 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6020\n",
      "macOS-14.2.1-arm64-arm-64bit\n",
      "uname_result(system='Darwin', node='Stefans-Mac-Studio.local', release='23.2.0', version='Darwin Kernel Version 23.2.0: Wed Nov 15 21:55:06 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6020', machine='arm64')\n"
     ]
    }
   ],
   "source": [
    "## IMPORT MODULES\n",
    "\n",
    "# Note: you may be asked to install additional modules/dependencies\n",
    "\n",
    "# dealing with Japanese tokenization\n",
    "import fugashi\n",
    "import ipadic\n",
    "\n",
    "## load general libraries\n",
    "import pandas as pd\n",
    "\n",
    "## load relevant functions from transformers library\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n",
    "\n",
    "## load platform to detect os and get information on machine\n",
    "import platform\n",
    "\n",
    "## get information on system\n",
    "import platform\n",
    "print(platform.machine())\n",
    "print(platform.version())\n",
    "print(platform.platform())\n",
    "print(platform.uname())\n",
    "\n",
    "# arm64\n",
    "# Darwin Kernel Version 23.2.0: Wed Nov 15 21:55:06 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6020\n",
    "# macOS-14.2.1-arm64-arm-64bit\n",
    "# uname_result(system='Darwin', node='Stefans-Mac-Studio.local', release='23.2.0', version='Darwin Kernel Version 23.2.0: Wed Nov 15 21:55:06 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6020', machine='arm64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BertForSequenceClassification(\n",
      "  (bert): BertModel(\n",
      "    (embeddings): BertEmbeddings(\n",
      "      (word_embeddings): Embedding(32000, 768, padding_idx=0)\n",
      "      (position_embeddings): Embedding(512, 768)\n",
      "      (token_type_embeddings): Embedding(2, 768)\n",
      "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
      "      (dropout): Dropout(p=0.1, inplace=False)\n",
      "    )\n",
      "    (encoder): BertEncoder(\n",
      "      (layer): ModuleList(\n",
      "        (0-11): 12 x BertLayer(\n",
      "          (attention): BertAttention(\n",
      "            (self): BertSelfAttention(\n",
      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
      "              (dropout): Dropout(p=0.1, inplace=False)\n",
      "            )\n",
      "            (output): BertSelfOutput(\n",
      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
      "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
      "              (dropout): Dropout(p=0.1, inplace=False)\n",
      "            )\n",
      "          )\n",
      "          (intermediate): BertIntermediate(\n",
      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
      "            (intermediate_act_fn): GELUActivation()\n",
      "          )\n",
      "          (output): BertOutput(\n",
      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
      "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
      "            (dropout): Dropout(p=0.1, inplace=False)\n",
      "          )\n",
      "        )\n",
      "      )\n",
      "    )\n",
      "    (pooler): BertPooler(\n",
      "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
      "      (activation): Tanh()\n",
      "    )\n",
      "  )\n",
      "  (dropout): Dropout(p=0.1, inplace=False)\n",
      "  (classifier): Linear(in_features=768, out_features=12, bias=True)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "# load fine-tuned model created in 03a _finetune_transformer,ipynb\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert_manifestos\", num_labels = 12)\n",
    "\n",
    "print(model)\n",
    "\n",
    "# tokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mps\n"
     ]
    }
   ],
   "source": [
    "# detect OS\n",
    "current_os = platform.system()\n",
    "# active device based on OS\n",
    "if current_os == \"Darwin\":\n",
    "    # specify device as mps\n",
    "    device = \"mps\"\n",
    "else:\n",
    "    # check if gpu is available, if yes use cuda, if not stick to cpu\n",
    "    device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
    "    # must be 'cuda:0', not just 'cuda' due to a bug in transformers library, see: https://github.com/deepset-ai/haystack/issues/3160\n",
    "\n",
    "# check which device is used (GPU or CPU)\n",
    "# if device = mps, then GPU is used on an Apple Silicon Mac or MacBook\n",
    "print(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a pipeline for text processing\n",
    "classifier = pipeline(\"text-classification\",\n",
    "                      model = model,\n",
    "                      max_length = 512,\n",
    "                      batch_size = 64,\n",
    "                      device = device,\n",
    "                      padding = 'max_length',\n",
    "                      truncation = True,\n",
    "                      tokenizer = tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "59619\n"
     ]
    }
   ],
   "source": [
    "## ## LOAD DATASET WITH ALL SENTENCES AND PREPARE FOR CLASSIFICAITON\n",
    "\n",
    "# load the CSV file with held-out test data\n",
    "df_all = pd.read_csv('data_sentences_all.csv')\n",
    "\n",
    "# check number observations\n",
    "num_rows_all = df_all.shape[0]\n",
    "print(num_rows_all)\n",
    "\n",
    "# ensure that the column containing the text is of type str\n",
    "df_all['text'] = df_all['text'].astype(str)\n",
    "\n",
    "text_data_all = df_all['text'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[9], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# PREDICT ALL SENTENCES AND BIND PREDICTIONS TO DATA FRAME\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# apply classifier to all data (will take a few minutes to complete)\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m results_all \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\u001b[43mclassifier\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext_data_all\u001b[49m\u001b[43m)\u001b[49m)\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/pipelines/text_classification.py:156\u001b[0m, in \u001b[0;36mTextClassificationPipeline.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    122\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    123\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    124\u001b[0m \u001b[38;5;124;03m    Classify the text(s) given as inputs.\u001b[39;00m\n\u001b[1;32m    125\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    154\u001b[0m \u001b[38;5;124;03m        If `top_k` is used, one such dictionary is returned per label.\u001b[39;00m\n\u001b[1;32m    155\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 156\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    157\u001b[0m     \u001b[38;5;66;03m# TODO try and retrieve it in a nicer way from _sanitize_parameters.\u001b[39;00m\n\u001b[1;32m    158\u001b[0m     _legacy \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtop_k\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m kwargs\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/pipelines/base.py:1121\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[0;34m(self, inputs, num_workers, batch_size, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1117\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m can_use_iterator:\n\u001b[1;32m   1118\u001b[0m     final_iterator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_iterator(\n\u001b[1;32m   1119\u001b[0m         inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params\n\u001b[1;32m   1120\u001b[0m     )\n\u001b[0;32m-> 1121\u001b[0m     outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfinal_iterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1122\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n\u001b[1;32m   1123\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py:124\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    121\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_item()\n\u001b[1;32m    123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[0;32m--> 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[1;32m    125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfer(item, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparams)\n\u001b[1;32m    126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py:124\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    121\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_item()\n\u001b[1;32m    123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[0;32m--> 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[1;32m    125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfer(item, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparams)\n\u001b[1;32m    126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/torch/utils/data/dataloader.py:630\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    627\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    628\u001b[0m     \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m    629\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset()  \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 630\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    631\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m    632\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m    633\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m    634\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/torch/utils/data/dataloader.py:674\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    672\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m    673\u001b[0m     index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index()  \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 674\u001b[0m     data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m    675\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m    676\u001b[0m         data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py:51\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m     49\u001b[0m         data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m     50\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 51\u001b[0m         data \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpossibly_batched_index\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m     52\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m     53\u001b[0m     data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py:51\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m     49\u001b[0m         data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m     50\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 51\u001b[0m         data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m     52\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m     53\u001b[0m     data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py:19\u001b[0m, in \u001b[0;36mPipelineDataset.__getitem__\u001b[0;34m(self, i)\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, i):\n\u001b[1;32m     18\u001b[0m     item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[i]\n\u001b[0;32m---> 19\u001b[0m     processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     20\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m processed\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/pipelines/text_classification.py:180\u001b[0m, in \u001b[0;36mTextClassificationPipeline.preprocess\u001b[0;34m(self, inputs, **tokenizer_kwargs)\u001b[0m\n\u001b[1;32m    174\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(inputs, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m    175\u001b[0m     \u001b[38;5;66;03m# This is likely an invalid usage of the pipeline attempting to pass text pairs.\u001b[39;00m\n\u001b[1;32m    176\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    177\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    178\u001b[0m         \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m dictionary `\u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMy text\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext_pair\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMy pair\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m}` in order to send a text pair.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m    179\u001b[0m     )\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtokenizer_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2802\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.__call__\u001b[0;34m(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m   2800\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_in_target_context_manager:\n\u001b[1;32m   2801\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_switch_to_input_mode()\n\u001b[0;32m-> 2802\u001b[0m     encodings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mall_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2803\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m text_target \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   2804\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_switch_to_target_mode()\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2908\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase._call_one\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m   2888\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_encode_plus(\n\u001b[1;32m   2889\u001b[0m         batch_text_or_text_pairs\u001b[38;5;241m=\u001b[39mbatch_text_or_text_pairs,\n\u001b[1;32m   2890\u001b[0m         add_special_tokens\u001b[38;5;241m=\u001b[39madd_special_tokens,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2905\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m   2906\u001b[0m     )\n\u001b[1;32m   2907\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2908\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2909\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2910\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2911\u001b[0m \u001b[43m        \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2912\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2913\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2914\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2915\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2916\u001b[0m \u001b[43m        \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2917\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2918\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2919\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2920\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2921\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2922\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2923\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2924\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2925\u001b[0m \u001b[43m        \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2926\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2927\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2981\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m   2971\u001b[0m \u001b[38;5;66;03m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[1;32m   2972\u001b[0m padding_strategy, truncation_strategy, max_length, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_padding_truncation_strategies(\n\u001b[1;32m   2973\u001b[0m     padding\u001b[38;5;241m=\u001b[39mpadding,\n\u001b[1;32m   2974\u001b[0m     truncation\u001b[38;5;241m=\u001b[39mtruncation,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2978\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m   2979\u001b[0m )\n\u001b[0;32m-> 2981\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2982\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2983\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2984\u001b[0m \u001b[43m    \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2985\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2986\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2987\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2988\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2989\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2990\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2991\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2992\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2993\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2994\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2995\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2996\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2997\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2998\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2999\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3000\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils.py:722\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m    719\u001b[0m first_ids \u001b[38;5;241m=\u001b[39m get_input_ids(text)\n\u001b[1;32m    720\u001b[0m second_ids \u001b[38;5;241m=\u001b[39m get_input_ids(text_pair) \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 722\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_for_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    723\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfirst_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    724\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpair_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msecond_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    725\u001b[0m \u001b[43m    \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    726\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_strategy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    727\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation_strategy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    728\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    729\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    730\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    731\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    732\u001b[0m \u001b[43m    \u001b[49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    733\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    734\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    735\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    736\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    737\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    738\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    739\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:3471\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.prepare_for_model\u001b[0;34m(self, ids, pair_ids, add_special_tokens, padding, truncation, max_length, stride, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, prepend_batch_axis, **kwargs)\u001b[0m\n\u001b[1;32m   3468\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m return_length:\n\u001b[1;32m   3469\u001b[0m     encoded_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlength\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(encoded_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m-> 3471\u001b[0m batch_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mBatchEncoding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   3472\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoded_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtensor_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepend_batch_axis\u001b[49m\n\u001b[1;32m   3473\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3475\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m batch_outputs\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:223\u001b[0m, in \u001b[0;36mBatchEncoding.__init__\u001b[0;34m(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)\u001b[0m\n\u001b[1;32m    219\u001b[0m     n_sequences \u001b[38;5;241m=\u001b[39m encoding[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mn_sequences\n\u001b[1;32m    221\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_n_sequences \u001b[38;5;241m=\u001b[39m n_sequences\n\u001b[0;32m--> 223\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_to_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensor_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtensor_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepend_batch_axis\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:748\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m    745\u001b[0m     value \u001b[38;5;241m=\u001b[39m [value]\n\u001b[1;32m    747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_tensor(value):\n\u001b[0;32m--> 748\u001b[0m     tensor \u001b[38;5;241m=\u001b[39m \u001b[43mas_tensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    750\u001b[0m     \u001b[38;5;66;03m# Removing this for now in favor of controlling the shape with `prepend_batch_axis`\u001b[39;00m\n\u001b[1;32m    751\u001b[0m     \u001b[38;5;66;03m# # at-least2d\u001b[39;00m\n\u001b[1;32m    752\u001b[0m     \u001b[38;5;66;03m# if tensor.ndim > 2:\u001b[39;00m\n\u001b[1;32m    753\u001b[0m     \u001b[38;5;66;03m#     tensor = tensor.squeeze(0)\u001b[39;00m\n\u001b[1;32m    754\u001b[0m     \u001b[38;5;66;03m# elif tensor.ndim < 2:\u001b[39;00m\n\u001b[1;32m    755\u001b[0m     \u001b[38;5;66;03m#     tensor = tensor[None, :]\u001b[39;00m\n\u001b[1;32m    757\u001b[0m     \u001b[38;5;28mself\u001b[39m[key] \u001b[38;5;241m=\u001b[39m tensor\n",
      "File \u001b[0;32m~/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:720\u001b[0m, in \u001b[0;36mBatchEncoding.convert_to_tensors.<locals>.as_tensor\u001b[0;34m(value, dtype)\u001b[0m\n\u001b[1;32m    718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value, \u001b[38;5;28mlist\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value[\u001b[38;5;241m0\u001b[39m], np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m    719\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39marray(value))\n\u001b[0;32m--> 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# PREDICT ALL SENTENCES AND BIND PREDICTIONS TO DATA FRAME\n",
    "\n",
    "# apply classifier to all data (will take a few minutes to complete)\n",
    "results_all = pd.DataFrame(classifier(text_data_all))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove the 'LABEL_' prefix from the 'label' column\n",
    "results_all['label'] = results_all['label'].str.replace('LABEL_', '')\n",
    "\n",
    "df_classified_all = df_all\n",
    "\n",
    "# add the results back to the original DataFrame\n",
    "df_classified_all['label'] = results_all['label']\n",
    "df_classified_all['score'] = results_all['score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "59619\n"
     ]
    }
   ],
   "source": [
    "# check number observations\n",
    "num_rows_all_classified = df_classified_all.shape[0]\n",
    "print(num_rows_all_classified)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the DataFrame with results back to a new CSV file\n",
    "df_classified_all.to_csv('data_predicted_full.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
